In [1]:
# Required Packages
import pandas as pd
import numpy as np

import json
import folium

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Los Angeles County Restaurant and Market Violations and Inspections

In this article, we use a dataset contains Environmental Health Violations for Restaurants and Markets in Los Angeles County. This dataset can be found here and here.

Los Angeles County Environmental Health is responsible for checking food violations for all unincorporated areas and 85 of the 88 cities in the County. This dataset does not include Pasadena, Long Beach or Vernon (each has its own city health department).

Loading Datasets

The data that has been processed before.

In [2]:
Violations_df = pd.read_csv('Data/LA/Clean_Violations_df.csv')
Inspections_df = pd.read_csv('Data/LA/Clean_Inspections_df.csv')
Data  = pd.read_csv('Data/LA/Data_Inspections_Violations_df.csv')
Violation_Dictionary_df  = pd.read_csv('Data/LA/Violation_Dictionary_df.csv')
Violations_matrix  = pd.read_csv('Data/LA/Violations_matrix_df.csv')
In [3]:
Data['Facility Zip'] = Data['Facility Zip'].astype(str)
Data['Facility Zip'] = Data['Facility Zip'].apply(lambda x: x[:5])

Geographical Analysis

In [4]:
Facilities_Geographical_df = Data.groupby('Facility Zip').agg(Average_Score=('Score', 'mean'),
                                                              Total_Facilities=('Facility ID', pd.Series.nunique)).reset_index()
Facilities_Geographical_df = Facilities_Geographical_df.rename(columns = {'Average_Score':'Average Score',
                                                                          'Total_Facilities':'Total Facilities'})
Facilities_Geographical_df = Facilities_Geographical_df.dropna()
Facilities_Geographical_df.head().style.hide_index() 
Out[4]:
Facility Zip Average Score Total Facilities
2337. 90.000000 1
60241 97.000000 1
90001 90.581662 277
90002 92.362390 64
90003 90.646483 265

Next, we can export a geojson file from here. However, we would like to reduce the size of this file for the sake of having an optimal computation.

In [5]:
# loading the GeoJSON file
with open('Data/LA/LA_ZIP_Codes.geojson', 'r') as jsonFile:
    Zipcode_Data = json.load(jsonFile)

# Creating a list from Zip codes
mylist = Facilities_Geographical_df['Facility Zip'].unique().tolist()

# removing ZIP codes that are not in our dataset
temp = []
for i in range(len(Zipcode_Data['features'])):
    if Zipcode_Data['features'][i]['properties']['name'] in mylist:
        temp.append(Zipcode_Data['features'][i])

# creating a new JSON file
Reduced_Zipcode_Data = dict.fromkeys(['type','features'])
Reduced_Zipcode_Data['type'] = 'FeatureCollection'
Reduced_Zipcode_Data['features'] = temp

del Zipcode_Data, mylist, temp
# save the JSON file
open("Data/LA/reduced_LA_ZIP_Codes.json", "w").write(json.dumps(Reduced_Zipcode_Data,
                                                             sort_keys=True, indent=4, separators=(',', ': ')))
del Reduced_Zipcode_Data
In [6]:
def plot_map(Inp_Column, Text_Legend = '', Inp_Df = Facilities_Geographical_df, Zoom_Level=8):

    # reading of the updated GeoJSON file
    Geographical_Data = r'Data/LA/reduced_LA_ZIP_Codes.json'

    # initiating a Folium map
    m = folium.Map(location = [34.052, -118.243], zoom_start = Zoom_Level)

    # creating a map
    m.choropleth(geo_data = Geographical_Data, fill_opacity = 0.8, line_opacity = 0.2,
        data = Inp_Df, key_on = 'feature.properties.name', columns = ['Facility Zip', Inp_Column],
        fill_color = 'RdYlGn', legend_name = Text_Legend)
    folium.LayerControl().add_to(m)
    
    # Show the map
    return m
In [7]:
plot_map('Average Score', 'The Average Score of Facilities')
Out[7]:
In [8]:
plot_map('Total Facilities','Total Facilities')
Out[8]:

Code Violations by Each Facility

Frist, let's addd the Violation Matrix to our Data

In [9]:
Data=pd.merge(Data, Violations_matrix, on='Facility ID', how='right')

Moreover, previously, we found the following violation code as the most correlated ones.

'F037', 'F040', 'F007', 'F044', 'MF34', 'W007', 'W035', 'MF41', 'MF45', 'W017', 'W021', 'W034', 'MF31', 'W033', 'MF38', 'W032', 'MF15', 'MF08', 'F035', 'F033', 'W044', 'MF36', 'W011', 'W031'

Thus,

In [10]:
Facility_Violations_df = Data.groupby(['Facility Zip','Facility ID']).agg({'F037': np.mean,'F040': np.mean,'F007': np.mean,
                        'F044': np.mean,'MF34': np.mean,'F037': np.mean,'W007': np.mean,'W035': np.mean,'MF41': np.mean,
                        'W017': np.mean,'W021': np.mean,'W034': np.mean,'MF31': np.mean,'W033': np.mean,'MF38': np.mean,
                        'W032': np.mean,'MF15': np.mean,'MF08': np.mean,'F035': np.mean,'W044': np.mean,'MF36': np.mean,
                        'W011': np.mean,'W031': np.mean})
Facility_Violations_df = Facility_Violations_df.groupby(level=0).mean()
Facility_Violations_df.reset_index(inplace=True)
In [11]:
Facility_Violations_df.head().style.hide_index()
Out[11]:
Facility Zip F037 F040 F007 F044 MF34 W007 W035 MF41 W017 W021 W034 MF31 W033 MF38 W032 MF15 MF08 F035 W044 MF36 W011 W031
2337. 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
60241 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
90001 1.180505 0.898917 0.664260 2.274368 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.613718 0.000000 0.000000 0.000000 0.000000
90002 0.781250 0.765625 0.421875 2.031250 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.281250 0.000000 0.000000 0.000000 0.000000
90003 1.022642 1.022642 0.652830 2.683019 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.083019 0.000000 0.000000 0.000000 0.000000

Here, we only plot the following columns

In [12]:
temp = Facility_Violations_df[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['F007', 'F035']

Therefore,

In [13]:
vCode = 'F037'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[13]:
In [14]:
vCode = 'F040'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[14]:
In [15]:
vCode = 'F007'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[15]:
In [16]:
vCode = 'F044'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[16]:
In [17]:
vCode = 'F035'
map_title = ('Average Violation: % s' % Violation_Dictionary_df[Violation_Dictionary_df['Violation Code']== vCode]['Violation Description'].values[0])
plot_map(vCode, map_title ,Facility_Violations_df)
Out[17]:

Program Elements: Seats

In [18]:
PE_Seats_Counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Seats']).size().unstack(1)).reset_index()
PE_Seats_Counts.head().style.hide_index()
Out[18]:
Facility Zip 0-10 SQ. FT. 0-1999 SQ. FT. 0-30 SQ. FT. 0-999 SQ. FT. 1-1999 SQ. FT. 1-4999 SQ. FT. 1000-1999 SQ. FT. 10000+ SQ. FT. 151+ SQ. FT. 2000+ SQ. FT. 2000-3999 SQ. FT. 2000-4999 SQ. FT. 2000-5999 SQ. FT. 31-60 SQ. FT. 4000-9999 SQ. FT. 6000+ SQ. FT. Other
2337. nan nan 6.000000 nan nan nan nan nan nan nan nan nan nan nan nan nan nan
60241 nan nan 3.000000 nan nan nan nan nan nan nan nan nan nan nan nan nan nan
90001 nan nan 2766.000000 nan 1556.000000 nan nan nan 142.000000 436.000000 nan nan nan 876.000000 nan nan nan
90002 nan nan 431.000000 nan 395.000000 nan nan nan 19.000000 102.000000 nan nan nan 60.000000 nan nan nan
90003 nan nan 2904.000000 nan 2293.000000 nan nan nan nan 237.000000 nan nan nan 443.000000 nan nan nan

We would like to plot only the following columns:

In [19]:
temp = PE_Seats_Counts[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['0-30 SQ. FT.']
In [20]:
plot_map('0-30 SQ. FT.', 'Facility Total Count (0-30 SQ. FT.)' ,PE_Seats_Counts)
Out[20]:
In [21]:
plot_map('1-1999 SQ. FT.', 'Facility Total Count (1-1999 SQ. FT.)' ,PE_Seats_Counts)
Out[21]:
In [22]:
plot_map('151+ SQ. FT.', 'Facility Total Count (151+ SQ. FT.)' ,PE_Seats_Counts)
Out[22]:
In [23]:
plot_map('31-60 SQ. FT.', 'Facility Total Count (31-60 SQ. FT.)' ,PE_Seats_Counts)
Out[23]:

Program Elements: Type

In [24]:
PE_type_counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Type']).size().unstack(1)).reset_index()
PE_type_counts.head().style.hide_index()
Out[24]:
Facility Zip Caterer Food Market Retail Food Market Wholesale Food Processing Wholesale Food Vehicle Commissary Food Warehouse Interim Housing FF LIC HTH Care Food FAC Private School Cafeteria Restaurant Swap Meet Prepackaged Food Stand
2337. nan nan nan nan nan nan nan nan nan 6.000000 nan
60241 nan nan nan nan nan nan nan nan nan 3.000000 nan
90001 nan 1992.000000 nan nan nan nan nan nan nan 3843.000000 nan
90002 nan 497.000000 nan nan nan nan nan nan nan 524.000000 nan
90003 nan 2530.000000 nan nan nan nan nan nan nan 3498.000000 nan

We only plot the following columns.

In [25]:
temp = PE_type_counts[0:1].any().to_dict()
mylist=list({k:v for k, v in temp.items() if v == True}.keys())
print(mylist[1:])
['Restaurant']
In [26]:
plot_map('Food Market Retail', 'PE Type: Food Market Retail' ,PE_type_counts)
Out[26]:
In [27]:
plot_map('Restaurant', 'PE Type: Restaurant' ,PE_type_counts)
Out[27]:

Program Elements: Risk

In [28]:
PE_Risk_Counts = pd.DataFrame(Data.groupby(['Facility Zip','PE_Risk']).size().unstack(1)).reset_index()
PE_Risk_Counts.head().style.hide_index()
Out[28]:
Facility Zip High Risk Low Risk Moderate Risk
2337. 6.000000 nan nan
60241 nan nan 3.000000
90001 3803.000000 1013.000000 1019.000000
90002 474.000000 209.000000 338.000000
90003 3300.000000 1546.000000 1182.000000
In [29]:
plot_map('Low Risk', 'PE Risk: Low Risk' ,PE_Risk_Counts)
Out[29]:
In [30]:
plot_map('Moderate Risk', 'PE Risk: Moderate Risk' ,PE_Risk_Counts)
Out[30]:
In [31]:
plot_map('High Risk', 'PE Risk: High Risk' ,PE_Risk_Counts)
Out[31]: